library(dplyr)
library(ggfortify)
library(tidyverse)
library(factoextra)
library(cluster)
library(mclust)
__ __
____ ___ _____/ /_ _______/ /_
/ __ `__ \/ ___/ / / / / ___/ __/
/ / / / / / /__/ / /_/ (__ ) /_
/_/ /_/ /_/\___/_/\__,_/____/\__/ version 6.1.1
Type 'citation("mclust")' for citing this R package in publications.
Attaching package: ‘mclust’
The following object is masked from ‘package:purrr’:
map
Consider data set lab15.
# Read the comma-delimited data set (use <- for assignment per R style)
lab15 <- read_delim("lab15.csv", delim = ",")
Rows: 120 Columns: 2
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
dbl (2): x, y
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Use k-means clustering with 2 clusters (with other parameters having default values) to cluster the data.
# Fix the RNG seed so the random k-means initialization is reproducible
set.seed(20241212)
# K-means with 2 clusters; all other parameters at their defaults
ex1.1 <- kmeans(lab15, 2)
ex1.1
K-means clustering with 2 clusters of sizes 58, 62
Cluster means:
x y
1 2.385596 -1.421775
2 2.876544 2.760950
Clustering vector:
[1] 2 2 2 1 1 2 2 1 1 2 2 1 1 2 2 1 1 2 1 1 1 2 2 1 1 2 1 1 2 2 2 1 2 2 2 1 2 2 2 1 2 2 1 1 1 2 2 1 1 2 2 1 2 2 1 1 1 2 1 1 2 2 2 1 1 2 1 1 2
[70] 2 2 1 1 2 1 1 2 2 2 1 1 2 2 1 2 2 2 1 1 2 2 1 1 2 2 1 1 2 2 1 1 2 2 1 1 2 2 1 1 2 1 1 2 2 2 1 1 2 1 1
Within cluster sum of squares by cluster:
[1] 297.9516 408.2059
(between_SS / total_SS = 42.9 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size" "iter" "ifault"
Produce corresponding graph with factoextra::fviz_cluster().
# The task above asks for the corresponding graph; produce it first
fviz_cluster(ex1.1, data = lab15)
ex1.1$tot.withinss
[1] 706.1574
What is the value of the objective function (in the component tot.withinss)
# Objective function value: total within-cluster sum of squares
ex1.1$tot.withinss
Repeat the previous step until you find a different clustering of data. Visualize it. Which of the two clustering results is better?
# Re-run k-means with many random starts to land in a different
# (and likely better) local optimum of the objective.
km2_alt <- kmeans(lab15, centers = 2, nstart = 20)
# Show the alternative 2-cluster partition
fviz_cluster(km2_alt, data = lab15)
# Objective value for comparison with the first run (smaller = better)
km2_alt$tot.withinss
[1] 656.0803
Find the values of tot.withinss for 2,3,…,10 clusters (using nstart=20 to find a good clustering). Use elbow method to find a good candidate for the total number of clusters to use.
# Objective values for k = 2..10 (nstart = 20 for a good optimum),
# computed with base vapply instead of purrr::map_dbl.
wss_values <- vapply(
  2:10,
  function(k) kmeans(lab15, centers = k, nstart = 20)$tot.withinss,
  numeric(1)
)
# Elbow plot: the bend in the curve suggests the cluster count
plot(2:10, wss_values, type = "b", pch = 19, col = "blue",
     xlab = "Number of Clusters", ylab = "Total Within-cluster Sum of Squares")
Visualize the results for the best number of clusters.
# Fit the elbow-suggested 4-cluster solution with multiple restarts
km4 <- kmeans(lab15, centers = 4, nstart = 20)
# Visualize the 4-cluster partition
fviz_cluster(km4, data = lab15)
# Its objective function value
km4$tot.withinss
[1] 221.7143
Compute average silhouette values for 2,…,10 clusters and produce plot of average silhouette values. Which number of clusters gives the maximal value?
# Average silhouette width of the k-means solution for k = 2..10
sil_width <- vapply(2:10, function(k) {
  fit <- kmeans(lab15, centers = k, nstart = 20)
  widths <- silhouette(fit$cluster, dist(lab15))
  mean(widths[, 3])
}, numeric(1))
# The k maximizing this curve is the silhouette-preferred cluster count
plot(2:10, sil_width, type = "b", pch = 19, col = "red",
     xlab = "Number of Clusters", ylab = "Average Silhouette Width")
Consider data set USArrests. Find the best clustering (based on elbow method) for the dataset without scaling variables.
# Elbow method on the raw (unscaled) USArrests data
wss_usa <- vapply(
  2:10,
  function(k) kmeans(USArrests, centers = k, nstart = 20)$tot.withinss,
  numeric(1)
)
plot(2:10, wss_usa, type = "b", pch = 19, col = "blue",
     xlab = "Number of Clusters", ylab = "Total Within-cluster Sum of Squares")
# Fit the elbow-suggested solution and show it in the
# Murder/Assault plane.
best_k <- 3
km_usarrests <- kmeans(USArrests, centers = best_k, nstart = 20)
fviz_cluster(km_usarrests, data = USArrests, choose.vars = c("Murder", "Assault"))
Visualize the results with respect to Murder and Assault variables.
# Visualization with respect to Murder and Assault
# NOTE(review): this repeats the fit shown above; kept verbatim as the
# stand-alone answer to the visualization question (kmeans is re-run, so
# the partition may differ slightly from the previous one).
best_k <- 3
km_usarrests <- kmeans(USArrests, centers = best_k, nstart = 20)
fviz_cluster(km_usarrests, data = USArrests, choose.vars = c("Murder", "Assault"))
Find the best clustering for the scaled data set. Which clusters would you consider to be more meaningful?
# Standardize every column (mean 0, sd 1); Assault otherwise dominates
# the Euclidean distances because of its much larger variance.
usarrests_scaled <- scale(USArrests)
# Elbow curve for the scaled data
wss_scaled <- vapply(
  2:10,
  function(k) kmeans(usarrests_scaled, centers = k, nstart = 20)$tot.withinss,
  numeric(1)
)
plot(2:10, wss_scaled, type = "b", pch = 19, col = "blue",
     xlab = "Number of Clusters", ylab = "Total Within-cluster Sum of Squares")
# Fit and display the chosen solution on the scaled data
best_k_scaled <- 4
km_scaled <- kmeans(usarrests_scaled, centers = best_k_scaled, nstart = 20)
fviz_cluster(km_scaled, data = usarrests_scaled)
Hierarchical clustering In the case of hierarchical clustering we get a sequence of nested clusters, and we have to decide how many clusters to use. To decide on the number of clusters, we can look at the dendrogram (at which height there is the largest gap with no additional clusters appearing) or use some measure like the silhouette for choosing a good number of clusters. The result depends heavily on the dissimilarity measure used.
Consider data set lab15. Let us use the Euclidean distance (which can be computed by dist() function) for the dissimilarity measure.
Apply hierarchical clustering to the data set with complete linkage and look at the dendrogram. Can you spot a good number of clusters from it?
# Compute distance matrix and perform clustering
# Euclidean pairwise distances (the dist() default metric)
dist_lab15 <- dist(lab15)
hclust_complete <- hclust(dist_lab15, method = "complete")
# Plot dendrogram
# labels = FALSE suppresses the 120 observation labels on the leaves
plot(hclust_complete, labels = FALSE, main = "Complete Linkage Dendrogram")
# Overlay rectangles for a tentative 3-cluster cut of the tree
rect.hclust(hclust_complete, k = 3, border = "red")
Visualize the resulting clusters.
# Cut the complete-linkage tree into 3 groups and visualize them
clusters_hc <- cutree(hclust_complete, k = 3)
fviz_cluster(list(cluster = clusters_hc, data = lab15))
Compute average silhouettes for cluster sizes 2,3,…,10. Which number of clusters gives the highest value?
# Mean silhouette width for each k = 2..10 cut of the tree
# (name kept: the plotting step below this block reads it).
silhouette_avg_hc <- vapply(2:10, function(k) {
  grp <- cutree(hclust_complete, k = k)
  mean(silhouette(grp, dist_lab15)[, 3])
}, numeric(1))
Visualize the resulting clusters for the chosen number of clusters.
# Silhouette curve; choose the k with the highest average width
plot(2:10, silhouette_avg_hc, type = "b", pch = 19, col = "red",
     xlab = "Number of Clusters", ylab = "Average Silhouette Width")
# Cut at the silhouette-optimal number of clusters and visualize
best_k_hc <- 4
clusters_hc <- cutree(hclust_complete, k = best_k_hc)
fviz_cluster(list(cluster = clusters_hc, data = lab15))
Consider the data set ISLR::NCI60. Let us use hierarchical clustering with average linkage.
Find the best number of clusters for the data based on average silhouette when scaled data and Euclidean distance is used. Use table(cancer_types,assigned_clusters) to see if some cancer types correspond well to the discovered clusters.
# Scale the NCI60 expression matrix (64 cell lines x 6830 genes)
nci_scaled <- scale(ISLR::NCI60$data)
# Euclidean distances + average-linkage hierarchical clustering
dist_nci <- dist(nci_scaled)
hc_avg <- hclust(dist_nci, method = "average")
# Average silhouette width for k = 2..10 cuts of the tree
silhouette_avg_nci <- map_dbl(2:10, function(k) {
  clusters <- cutree(hc_avg, k = k)
  sil <- silhouette(clusters, dist_nci)
  mean(sil[, 3])
})
plot(2:10, silhouette_avg_nci, type = "b", pch = 19, col = "red",
     xlab = "Number of Clusters", ylab = "Average Silhouette Width")
# k = 2 maximizes the average silhouette here
best_k_hc <- 2
clusters_hc <- cutree(hc_avg, k = best_k_hc)
fviz_cluster(list(data = nci_scaled, cluster = clusters_hc))
# The task asks for table(cancer_types, assigned_clusters) — cross-tabulate
# the known cancer types against the discovered clusters.
cancer_types <- ISLR::NCI60$labs
table(cancer_types, clusters_hc)
Repeat the previous step in the case of the correlation-based dissimilarity matrix (defined by as.dist(1 - cor(t(data_df)))). Which approach seems to be better?
# Correlation-based dissimilarity: 1 - Pearson correlation between
# observations (rows), so highly correlated profiles are "close".
dist_corr <- as.dist(1 - cor(t(nci_scaled)))
# Average-linkage hierarchical clustering on the correlation distances
hc_corr <- hclust(dist_corr, method = "average")
# Average silhouette width for k = 2..10 cuts
silhouette_avg_corr <- map_dbl(2:10, function(k) {
  clusters <- cutree(hc_corr, k = k)
  sil <- silhouette(clusters, dist_corr)
  mean(sil[, 3])
})
plot(2:10, silhouette_avg_corr, type = "b", pch = 19, col = "blue",
     xlab = "Number of Clusters", ylab = "Average Silhouette Width")
best_k_hc <- 8
# BUG FIX: cutree() must receive the hclust tree (hc_corr), not the
# dist object (dist_corr); passing the latter raised
# "Error in tree$merge : $ operator is invalid for atomic vectors".
clusters_hc <- cutree(hc_corr, k = best_k_hc)
fviz_cluster(list(data = nci_scaled, cluster = clusters_hc))
# Compare cancer types with the clusters, as in the previous step
table(ISLR::NCI60$labs, clusters_hc)
Error in tree$merge : $ operator is invalid for atomic vectors — this error occurred because the dist object (dist_corr) was passed to cutree() instead of the hclust result (hc_corr); cutree() requires the fitted tree.
Clustering by Gaussian Mixture model The idea of Gaussian mixture distribution is to assume that data is a random sample from objects of different types/classes and that observation values in each class are normally distributed. It is possible to allow different degrees for flexibility for distributions of different types of observations - can only means be different or can there be some kind of differences in the covariance matrices also. The command Mclust(), when used with default parameters, chooses the flexibility of the distributions and the number of clusters by using BIC.
Consider again the dataset lab15.
Use Mclust() to determine the best number of clusters. Look also at the plot of BIC values (those are the negative of the usual BIC values we have used before) by using the plot() command with the parameter what="BIC". Visualize the clustering results with fviz_cluster() and also with plot() with the option what="classification". Look also at the estimated probabilities of the 49th observation belonging to the different clusters. Which are the two most probable clusters for this observation? Hint: the cluster probabilities can be obtained with the predict() command.
# Fit Gaussian mixture model
# With default arguments, Mclust() selects both the covariance model
# (cluster-shape flexibility) and the number of clusters via BIC.
gmm_lab15 <- Mclust(lab15)
fitting ...
|
| | 0%
|
|= | 1%
|
|== | 2%
|
|=== | 2%
|
|==== | 3%
|
|===== | 4%
|
|====== | 5%
|
|======= | 6%
|
|======== | 6%
|
|========= | 7%
|
|========== | 8%
|
|=========== | 9%
|
|============ | 9%
|
|============== | 10%
|
|=============== | 11%
|
|================ | 12%
|
|================= | 13%
|
|================== | 13%
|
|=================== | 14%
|
|==================== | 15%
|
|===================== | 16%
|
|====================== | 17%
|
|======================= | 17%
|
|======================== | 18%
|
|========================= | 19%
|
|========================== | 20%
|
|=========================== | 20%
|
|============================ | 21%
|
|============================= | 22%
|
|============================== | 23%
|
|=============================== | 24%
|
|================================ | 24%
|
|================================= | 25%
|
|================================== | 26%
|
|=================================== | 27%
|
|==================================== | 28%
|
|===================================== | 28%
|
|====================================== | 29%
|
|======================================= | 30%
|
|========================================= | 31%
|
|========================================== | 31%
|
|=========================================== | 32%
|
|============================================ | 33%
|
|============================================= | 34%
|
|============================================== | 35%
|
|=============================================== | 35%
|
|================================================ | 36%
|
|================================================= | 37%
|
|================================================== | 38%
|
|=================================================== | 39%
|
|==================================================== | 39%
|
|===================================================== | 40%
|
|====================================================== | 41%
|
|======================================================= | 42%
|
|======================================================== | 43%
|
|========================================================= | 43%
|
|========================================================== | 44%
|
|=========================================================== | 45%
|
|============================================================ | 46%
|
|============================================================= | 46%
|
|============================================================== | 47%
|
|=============================================================== | 48%
|
|================================================================ | 49%
|
|================================================================= | 50%
|
|=================================================================== | 50%
|
|==================================================================== | 51%
|
|===================================================================== | 52%
|
|====================================================================== | 53%
|
|======================================================================= | 54%
|
|======================================================================== | 54%
|
|========================================================================= | 55%
|
|========================================================================== | 56%
|
|=========================================================================== | 57%
|
|============================================================================ | 57%
|
|============================================================================= | 58%
|
|============================================================================== | 59%
|
|=============================================================================== | 60%
|
|================================================================================ | 61%
|
|================================================================================= | 61%
|
|================================================================================== | 62%
|
|=================================================================================== | 63%
|
|==================================================================================== | 64%
|
|===================================================================================== | 65%
|
|====================================================================================== | 65%
|
|======================================================================================= | 66%
|
|======================================================================================== | 67%
|
|========================================================================================= | 68%
|
|========================================================================================== | 69%
|
|=========================================================================================== | 69%
|
|============================================================================================= | 70%
|
|============================================================================================== | 71%
|
|=============================================================================================== | 72%
|
|================================================================================================ | 72%
|
|================================================================================================= | 73%
|
|================================================================================================== | 74%
|
|=================================================================================================== | 75%
|
|==================================================================================================== | 76%
|
|===================================================================================================== | 76%
|
|====================================================================================================== | 77%
|
|======================================================================================================= | 78%
|
|======================================================================================================== | 79%
|
|========================================================================================================= | 80%
|
|========================================================================================================== | 80%
|
|=========================================================================================================== | 81%
|
|============================================================================================================ | 82%
|
|============================================================================================================= | 83%
|
|============================================================================================================== | 83%
|
|=============================================================================================================== | 84%
|
|================================================================================================================ | 85%
|
|================================================================================================================= | 86%
|
|================================================================================================================== | 87%
|
|=================================================================================================================== | 87%
|
|==================================================================================================================== | 88%
|
|===================================================================================================================== | 89%
|
|====================================================================================================================== | 90%
|
|======================================================================================================================== | 91%
|
|========================================================================================================================= | 91%
|
|========================================================================================================================== | 92%
|
|=========================================================================================================================== | 93%
|
|============================================================================================================================ | 94%
|
|============================================================================================================================= | 94%
|
|============================================================================================================================== | 95%
|
|=============================================================================================================================== | 96%
|
|================================================================================================================================ | 97%
|
|================================================================================================================================= | 98%
|
|================================================================================================================================== | 98%
|
|=================================================================================================================================== | 99%
|
|====================================================================================================================================| 100%
# BIC curve across models and cluster counts (mclust's BIC is the
# negative of the usual convention, so larger values are better here).
plot(gmm_lab15, what = "BIC")
# Two views of the fitted classification
fviz_cluster(gmm_lab15)
plot(gmm_lab15, what = "classification")
# Posterior cluster-membership probabilities of the 49th observation
obs49_probs <- predict(gmm_lab15)$z[49, ]
obs49_probs
1 2 3 4 5
5.210224e-01 5.749146e-04 5.785793e-06 4.783969e-01 8.839710e-10